Load tidyverse and check that you are in the right working directory.
You can enter getwd() in the console window and see if the
path looks right.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
In the lecture we looked at a map of the US and a matrix of distances between 9 major cities. Let's read in the matrix and have a quick look. For tidiness, I have saved it inside a folder called 'data'.
# Read the 9x9 city distance matrix; the first column ('city') holds the labels
cities <- read_csv('data/city_distance.csv')
## Rows: 9 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): city
## dbl (9): BOS, NY, DC, MIA, CHI, SEA, SF, LA, DEN
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(cities)
## Rows: 9
## Columns: 10
## $ city <chr> "BOS", "NY", "DC", "MIA", "CHI", "SEA", "SF", "LA", "DEN"
## $ BOS <dbl> 0, 206, 429, 1504, 963, 2976, 3095, 2979, 1949
## $ NY <dbl> 206, 0, 233, 1308, 802, 2815, 2934, 2786, 1771
## $ DC <dbl> 429, 233, 0, 1075, 671, 2684, 2799, 2631, 1616
## $ MIA <dbl> 1504, 1308, 1075, 0, 1329, 3273, 3053, 2687, 2037
## $ CHI <dbl> 963, 802, 671, 1329, 0, 2013, 2142, 2054, 996
## $ SEA <dbl> 2976, 2815, 2684, 3273, 2013, 0, 808, 1131, 1307
## $ SF <dbl> 3095, 2934, 2799, 3053, 2142, 808, 0, 379, 1235
## $ LA <dbl> 2979, 2786, 2631, 2687, 2054, 1131, 379, 0, 1059
## $ DEN <dbl> 1949, 1771, 1616, 2037, 996, 1307, 1235, 1059, 0
# view() opens the data in a spreadsheet-style viewer and returns the data
# invisibly, so `viewcities` also holds the tibble and printing it echoes it
# to the console (see the output below).
viewcities <- view(cities)
viewcities
## # A tibble: 9 × 10
## city BOS NY DC MIA CHI SEA SF LA DEN
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 BOS 0 206 429 1504 963 2976 3095 2979 1949
## 2 NY 206 0 233 1308 802 2815 2934 2786 1771
## 3 DC 429 233 0 1075 671 2684 2799 2631 1616
## 4 MIA 1504 1308 1075 0 1329 3273 3053 2687 2037
## 5 CHI 963 802 671 1329 0 2013 2142 2054 996
## 6 SEA 2976 2815 2684 3273 2013 0 808 1131 1307
## 7 SF 3095 2934 2799 3053 2142 808 0 379 1235
## 8 LA 2979 2786 2631 2687 2054 1131 379 0 1059
## 9 DEN 1949 1771 1616 2037 996 1307 1235 1059 0
We will make a new dataframe called md_cities. We remove the column
called city to leave only numeric data. We use
cmdscale to create a matrix reduced to 2 dimensions.
Convert the matrix back to a dataframe and add the column
city back for plotting.
# Strip the label column, reduce the 9x9 distance matrix to two MDS
# dimensions, and keep the result as a data frame for plotting.
md_cities <- cities %>%
  select(-city) %>%
  cmdscale() %>%
  as.data.frame()
# Re-attach the city labels that were dropped before cmdscale()
md_cities$city_name <- cities$city
glimpse(md_cities)
## Rows: 9
## Columns: 3
## $ V1 <dbl> -1348.6683, -1198.8741, -1076.9855, -1226.9390, -428.4548, 1…
## $ V2 <dbl> -462.40060, -306.54690, -136.43204, 1013.62838, -174.60316, …
## $ city_name <chr> "BOS", "NY", "DC", "MIA", "CHI", "SEA", "SF", "LA", "DEN"
viewmdcities <- view(md_cities)
viewmdcities
## V1 V2 city_name
## 1 -1348.6683 -462.40060 BOS
## 2 -1198.8741 -306.54690 NY
## 3 -1076.9855 -136.43204 DC
## 4 -1226.9390 1013.62838 MIA
## 5 -428.4548 -174.60316 CHI
## 6 1596.1594 -639.30777 SEA
## 7 1697.2283 131.68586 SF
## 8 1464.0470 560.58046 LA
## 9 522.4871 13.39576 DEN
Now we can plot x and y to recreate the map. Oops - it's upside down.
# Plot the two MDS dimensions, labelling each point with its city name.
# The configuration can come out flipped relative to the real map.
ggplot(md_cities, aes(x = V1, y = V2)) +
geom_text(aes(label = city_name))
Turn it back the right way by reversing both axes.
# Reverse both axes to flip the configuration back so it matches the real map
ggplot(md_cities, aes(x = V1, y = V2)) +
geom_text(aes(label = city_name)) +
scale_x_reverse() +
scale_y_reverse()
National Student Survey Results
OPTIONAL: You can open up the R script called “2_week_04_NSS24_preprocess.R” If you run the whole script, it will read in a csv of NSS data (from inside the ‘data’ folder), pre-process it, and save the clean version as a csv file (back into the data folder), and also as an R object.
I have already made the R object for you which you can just load directly.
# NOTE(review): the text above says the data lives in the 'data' folder, but
# this loads from the project root - confirm the file's actual location.
# load() restores the object under the name it was saved with (NSS24 here).
load('./nss_agree.rda')
glimpse(NSS24)
## Rows: 411
## Columns: 10
## $ ukprn <chr> "10042570", "10067853", "10000163", "10007849", "10007856", "…
## $ provider <chr> "AAP Education Limited", "ACM Guildford Limited", "AECC Unive…
## $ Scale01 <dbl> 0.785, 0.765, 0.788, 0.872, 0.886, 0.955, 0.683, 0.958, 0.827…
## $ Scale02 <dbl> 0.736, 0.740, 0.686, 0.838, 0.867, 1.000, 0.635, 0.888, 0.806…
## $ Scale03 <dbl> 0.768, 0.773, 0.476, 0.835, 0.848, 0.960, 0.740, 0.838, 0.787…
## $ Scale04 <dbl> 0.780, 0.782, 0.836, 0.905, 0.913, 1.000, 0.800, 0.938, 0.810…
## $ Scale05 <dbl> 0.493, 0.538, 0.438, 0.799, 0.853, 0.800, 0.425, 0.750, 0.675…
## $ Scale06 <dbl> 0.485, 0.712, 0.824, 0.907, 0.916, 0.758, 0.628, 0.747, 0.854…
## $ Scale07 <dbl> 0.594, 0.626, 0.540, 0.780, 0.803, 0.933, 0.533, 0.778, 0.723…
## $ exeter <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
# Work with the shorter name used throughout the rest of the script
nss_agree <- NSS24
# Single row for the University of Exeter
nss_exeter <- filter(nss_agree, provider == 'University of Exeter')
print(nss_exeter)
## # A tibble: 1 × 10
## ukprn provider Scale01 Scale02 Scale03 Scale04 Scale05 Scale06 Scale07 exeter
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
## 1 10007… Univers… 0.869 0.802 0.71 0.852 0.812 0.867 0.722 TRUE
# Single row for LSE, for comparison with Exeter
nss_lse <- filter(nss_agree,
                  provider == 'The London School of Economics and Political Science')
print(nss_lse)
## # A tibble: 1 × 10
## ukprn provider Scale01 Scale02 Scale03 Scale04 Scale05 Scale06 Scale07 exeter
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
## 1 10004… The Lon… 0.875 0.801 0.732 0.877 0.82 0.886 0.741 FALSE
Take the difference of each item, square each difference and add it all up. Take the squareroot.
# Euclidean distance between Exeter and LSE: take the difference of each
# scale, square, sum, then square-root. Subtracting two one-row data frames
# works element-wise, which replaces the seven hand-written terms.
sqrt(sum((select(nss_exeter, starts_with('Scale')) -
            select(nss_lse, starts_with('Scale')))^2))
## [1] 0.04395452
# Single row for the University of Bristol
nss_bristol <- filter(nss_agree, provider == 'University of Bristol')
print(nss_bristol)
## # A tibble: 1 × 10
## ukprn provider Scale01 Scale02 Scale03 Scale04 Scale05 Scale06 Scale07 exeter
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
## 1 10007… Univers… 0.856 0.788 0.686 0.85 0.727 0.864 0.718 FALSE
# Same Euclidean distance calculation, this time Exeter vs Bristol,
# vectorised over the seven Scale columns instead of written out by hand.
sqrt(sum((select(nss_exeter, starts_with('Scale')) -
            select(nss_bristol, starts_with('Scale')))^2))
## [1] 0.09052624
# Match providers whose name contains 'Bristol'. grepl() takes a regular
# expression, not a shell glob: in '*Bristol*' the leading '*' has nothing
# to repeat and the trailing '*' made the final 'l' optional.
nss_agree %>% filter(grepl('Bristol', provider))
## # A tibble: 4 × 10
## ukprn provider Scale01 Scale02 Scale03 Scale04 Scale05 Scale06 Scale07 exeter
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
## 1 10001… City of… 0.956 0.906 0.918 0.912 0.882 0.392 0.804 FALSE
## 2 10023… Trinity… 1 1 0.982 1 1 1 1 FALSE
## 3 10007… Univers… 0.856 0.788 0.686 0.85 0.727 0.864 0.718 FALSE
## 4 10007… Univers… 0.836 0.813 0.758 0.832 0.725 0.873 0.714 FALSE
Compute all the distances simultaneously - need to do a bit of re-shaping. Requires numeric data only so need to create a numeric matrix.
# dist() needs a purely numeric matrix, so drop the id columns and carry
# the provider names as rownames instead.
nss_m <- nss_agree %>%
  select(-c(ukprn, provider, exeter)) %>%
  as.matrix()
rownames(nss_m) <- nss_agree$provider
glimpse(nss_m)
## num [1:411, 1:7] 0.785 0.765 0.788 0.872 0.886 0.955 0.683 0.958 0.827 0.958 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:411] "AAP Education Limited" "ACM Guildford Limited" "AECC University College" "Abertay University" ...
## ..$ : chr [1:7] "Scale01" "Scale02" "Scale03" "Scale04" ...
All pairs of Euclidean distances - check the size of the output
# All pairwise Euclidean distances between the 411 rows, stored as a
# compact lower-triangle 'dist' object
nss_dist <- dist(nss_m)
If we turn this into a matrix, and look at only the first three schools we see the distances. Along the diagonal the distance is 0, because that’s the distance of a school to itself.
# Expand to a full symmetric matrix and inspect the 3x3 top-left corner
as.matrix(nss_dist)[1:3, 1:3]
## AAP Education Limited ACM Guildford Limited
## AAP Education Limited 0.0000000 0.2345698
## ACM Guildford Limited 0.2345698 0.0000000
## AECC University College 0.4601858 0.3528597
## AECC University College
## AAP Education Limited 0.4601858
## ACM Guildford Limited 0.3528597
## AECC University College 0.0000000
Quick summary stats of the distances.
summary(nss_dist)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01483 0.18326 0.27121 0.31168 0.39705 1.38850
Here’s a histogram of the distances. They tend to be between 0 and 0.5.
# Flatten the 'dist' object to a plain numeric vector and histogram it
hist(as.numeric(nss_dist), breaks = 100, main = "Histogram of all distances")
Which university in the data is the most different from the University of Exeter? Which is the most similar?
as.matrix(nss_dist)[, "University of Exeter"] will produce a named vector of distances to all the schools. Can you work out the highest and lowest from the code below?
# Distances from every provider to the University of Exeter
exeter_col <- as.matrix(nss_dist)[, "University of Exeter"]
# Blank out the zero self-distance so Exeter can't be its own nearest match
exeter_col[exeter_col == 0] <- NA
most_similar <- which.min(exeter_col)
most_different <- which.max(exeter_col)
# Peek at the closest few schools (Exeter itself comes first at distance 0)
head(sort(as.matrix(nss_dist)[, "University of Exeter"]))
## University of Exeter
## 0.00000000
## University of Sussex
## 0.03196873
## The London School of Economics and Political Science
## 0.04395452
## University of Nottingham, The
## 0.05233546
## University of Southampton
## 0.05273519
## Queen's University of Belfast
## 0.06038212
# Classical MDS down to two dimensions, tidied into a data frame with the
# provider names as an ordinary column.
nss_mds <- nss_dist %>%
  cmdscale() %>%
  as.data.frame() %>%
  rownames_to_column(var = 'provider') %>%
  rename(mds1 = V1, mds2 = V2)
ggplot(nss_mds, aes(x = mds1, y = mds2)) +
  geom_point()
Adding text is a total mess
ggplot(nss_mds, aes(x = mds1, y = mds2)) +
geom_point() +
geom_text(aes(label=provider))
We can use ‘filter’ to select points of interest for labeling
library(ggrepel)
ggplot(nss_mds, aes(x = mds1, y = mds2)) +
geom_point() +
geom_text_repel(data = filter(nss_mds, mds1 > 0.45),
aes(label = provider))
Any outliers outside a unit circle? This will label only rows where the Euclidean distance from the origin is > 1.
ggplot(nss_mds, aes(x = mds1, y = mds2)) +
geom_point() +
geom_text_repel(data = filter(nss_mds,sqrt(mds1^2 + mds2^2) >1),
aes(label = provider))
Find University of Exeter
ggplot(nss_mds, aes(x = mds1, y = mds2)) +
geom_point() +
geom_text_repel(data = filter(nss_mds,provider=='University of Exeter'),
aes(label = provider))
# Colour Exeter red against all other providers, label just that one point,
# and drop the legend (the single red point is self-explanatory).
ggplot(nss_mds, aes(x = mds1, y = mds2)) +
geom_point(aes(color = ifelse(provider == 'University of Exeter', 'Exeter', 'Other'))) +
scale_color_manual(values = c('Exeter' = 'red', 'Other' = 'black')) +
geom_text_repel(data = filter(nss_mds, provider == 'University of Exeter'),
aes(label = provider),
nudge_x = 0.5, # Optional: Adjust text nudge to avoid overlap
nudge_y = 0.5) + # Optional: Adjust text nudge to avoid overlap
theme(legend.position = "none") # Hide the legend, assuming it's not needed
library(corrplot)
## corrplot 0.95 loaded
# Correlation matrix of the seven numeric scales.
# select(where(is.numeric)) replaces the superseded select_if().
nss_cor <- nss_agree %>%
  select(where(is.numeric)) %>%
  cor()
What do you think about the correlated relationships? Which variable stands out from all the others? Can you think why? You may need to look at the original dataset to see what each of the themes are.
corrplot(nss_cor)
If you really don't like the default label colour and style:
corrplot(nss_cor, method = 'color', tl.col = 'grey20')
Create a scatter plot of Scale01 against Scale02
# Scatter plot of the first two scales against each other
ggplot(nss_agree, aes(x = Scale01, y = Scale02)) +
geom_point()
Create scatter plots for all pairs - you’ll use this for your final
report
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pairwise scatter plots of every numeric scale against every other.
# select(where(is.numeric)) replaces the superseded select_if().
nss_agree %>%
  select(where(is.numeric)) %>%
  ggpairs()
Another way to look at some other plots ## 3D interactive plot
Can you find Exeter?
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
https://plot.ly/r/3d-scatter-plots/
plot_ly(nss_agree, x = ~Scale01, y = ~Scale02, z = ~Scale06, color = ~exeter, size = 2,
text = ~paste('University:', provider)) %>%
add_markers()
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
library(FactoMineR)
nss_pca <- PCA(nss_m)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
It looks like every single school can be described by a single number, they can all be ranked on a single dimension!
fviz_eig(nss_pca)
fviz_pca_var(nss_pca)
Lollipop plot.
# Lollipop plot of each scale's contribution to each principal component.
# pivot_longer() replaces the superseded gather() (same columns: scale,
# pca, value; only the row order of the long table differs, which does
# not affect the plot).
nss_pca$var$contrib %>%
  as.data.frame() %>%
  rownames_to_column(var = 'scale') %>%
  pivot_longer(-scale, names_to = 'pca', values_to = 'value') %>%
  ggplot() +
  geom_point(aes(y = scale, x = value)) +
  geom_segment(x = 0, aes(xend = value, y = scale, yend = scale)) +
  facet_wrap(~ pca)
fviz_pca_ind(nss_pca)
fviz_pca_ind(nss_pca, geom = 'point')
fviz_pca_biplot(nss_pca, geom = 'point')
Out of interest, let’s look at Dimensions 2 and 3
fviz_pca_biplot(nss_pca, geom = 'point', axes = c(2,3))
Merge in the PCA results.
# Pull the PCA individual coordinates into a regular data frame keyed by
# provider so they can be joined back onto the NSS data.
nss_pca_ind <- as.data.frame(nss_pca$ind) %>%
  rownames_to_column(var = 'provider')
nss <- left_join(nss_agree, nss_pca_ind, by = 'provider')
Plot in 3D using first three dimensions
plot_ly(nss, x = ~coord.Dim.1, y = ~coord.Dim.2, z = ~coord.Dim.3, color = ~exeter,
size = 2, colors = c('#2C5B9B', '#F37267'),
text = ~paste('University:', provider)) %>%
add_markers()
# Average-linkage (UPGMA) clustering of the cities straight from the
# ready-made distance matrix. Spell out method = 'average'; the original
# 'ave' only resolved via partial matching.
cities_hc <- select(cities, -city) %>%
  as.dist() %>%
  hclust(method = 'average')
plot(cities_hc)
Play around with the miles_apart value. What is the best
place to cut the tree to give you a good clustering?
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
miles_apart <- 300 # height (miles) at which to cut the dendrogram
# Assign each city a cluster id by cutting the tree at that height
md_cities$cluster <- cutree(cities_hc, h = miles_apart)
# MDS map coloured by cluster membership (axes reversed as before)
p1 <- ggplot(md_cities, aes(x = V1, y = V2, color = as.factor(cluster))) +
scale_x_reverse('MDS 1') +
scale_y_reverse('MDS 2') +
geom_text(aes(label = city_name)) +
theme_bw()
# Dendrogram with a dashed line showing where the tree was cut
p2 <- fviz_dend(cities_hc) +
geom_hline(yintercept = miles_apart, linetype = 2)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
grid.arrange(p1, p2, nrow = 1)
What happens when we use hierarchical clustering on the most recent NSS data?
# Cluster the schools hierarchically from the same distance object
nss_hc <- hclust(nss_dist)
# Abbreviate provider names so the dendrogram labels stay legible
nss_hc$labels <- as.character(abbreviate(nss_hc$labels, minlength = 10))
plot(nss_hc)
fviz_dend(nss_hc, rotate = T) +
theme(text = element_text(size = 10))
# Plain data frame of just the seven scales, rownamed by provider
# (the conversion from tibble is needed so rownames can be set).
nss_d <- nss_agree %>%
  select(starts_with('Scale')) %>%
  data.frame()
rownames(nss_d) <- nss_agree$provider
Try varying the number of clusters by changing k to
different values. NOTE: These are randomly generated clusters
How does it impact the WSS and Silhouette recommendations for the number of clusters? How many clusters would you choose compared to how many it recommends?
#set seed so we get the same results each time
# Set seed so we get the same results each time
set.seed(666)
# Parameters
k <- 5   # how many clusters do we want?
N <- 500 # points per cluster
# Generate cluster centers uniformly on a 10 x 10 square
xs <- runif(k, 0, 10)
ys <- runif(k, 0, 10)
# Generate data for each cluster: rnorm() is vectorised over `mean`, so
# rep(centre, each = N) draws N points around each centre in one call.
# This consumes the random stream in the same order as the original
# per-cluster lapply, so the generated data is unchanged.
d <- data.frame(
  xs = rnorm(N * k, mean = rep(xs, each = N), sd = 0.5),
  ys = rnorm(N * k, mean = rep(ys, each = N), sd = 0.5),
  cluster = factor(rep(1:k, each = N)) # Add a 'cluster' column
)
# Plot the data
ggplot(d, aes(x = xs, y = ys, color = cluster)) +
  geom_point(alpha = 0.5) +
  theme_minimal() +
  scale_color_discrete(name = "Cluster")
# Elbow (within-cluster sum of squares) and silhouette heuristics for
# choosing the number of clusters on the simulated data
fviz_nbclust(d,kmeans, method = 'wss')
fviz_nbclust(d,kmeans, method = 'silhouette')
Now let’s do k-means clustering on our generated dataset. Run the cell multiple times - what do you see?
The initial placement of centroids affects the final clustering result.
# Perform k-means clustering (use only numeric columns).
# NOTE: centroids are initialised randomly, so re-running this cell can
# give a different partition (see the text above).
kmeans_result <- kmeans(d[, c("xs", "ys")], centers = k)
# Visualize clusters using factoextra
fviz_cluster(kmeans_result, data = d[, c("xs", "ys")], geom = "point")
# The same two heuristics on the NSS scales, for both hierarchical (hcut)
# and k-means clustering
fviz_nbclust(nss_d,hcut, method = 'wss')
fviz_nbclust(nss_d,kmeans, method = 'wss')
fviz_nbclust(nss_d,hcut, method = 'silhouette')
fviz_nbclust(nss_d,kmeans, method = 'silhouette')
Play with the number of clusters. What happens at N = 7?
The overlap in your plot when visualising k-means is due to the underlying data distribution. As k-means works by minimising the sum of squared distances between data points and their assigned cluster centroids, it assumes clusters are roughly spherical, equally sized, and non-overlapping. The dimensions appear to overlap when visualised in 2D.
Hierarchical clustering can capture more complex cluster shapes.
N <- 3 # number of clusters to compare across both methods
# k-means with 20 random restarts (nstart) to stabilise the solution
nss_km <- kmeans(nss_d, centers = N, nstart = 20)
p1 <- fviz_cluster(nss_km, nss_d, geom = 'point') +
theme_minimal() +
labs(title = paste('K-Means Clustering (k =',N,')'))
# Hierarchical clustering cut into the same number of clusters
nss_hcut <- hcut(nss_d, k = N)
p2 <- fviz_cluster(nss_hcut, nss_d, geom = 'point') +
theme_minimal() +
labs(title = paste('Hierarchical Clustering (k =',N,')'))
# Stack the two plots for a side-by-side comparison
grid.arrange(p1, p2)
Instead of a scatterplot showing the clusters in 2D space, we are using a density plot that shows distribution of data points along a single dimension. Each cluster is represented as a single density curve. You can see how much overlap exists between clusters in dimension 1.
# Re-run k-means and attach each school's cluster to the joined nss data
nss_km <- kmeans(nss_d, centers = N, nstart = 20)
nss$km_cluster <- factor(nss_km$cluster)
# Density of PCA dimension 1 per cluster shows how much the clusters overlap
ggplot(nss, aes(x = coord.Dim.1, fill = km_cluster)) +
geom_density(color = 'white', alpha = 0.5) +
theme_minimal()
nss_km$centers
## Scale01 Scale02 Scale03 Scale04 Scale05 Scale06 Scale07
## 1 0.7902308 0.7520577 0.7078462 0.7730962 0.4778077 0.6826154 0.6057308
## 2 0.8644980 0.8353306 0.8038327 0.8636939 0.7274939 0.8360000 0.7418122
## 3 0.9481930 0.9303421 0.9196930 0.9448246 0.8805965 0.8764737 0.8780526
plot_ly(nss, x = ~Scale01, y = ~Scale02, z = ~Scale06, color = ~km_cluster, size = 2,
text = ~paste('University:', provider)) %>%
add_markers()
plot_ly(nss, x = ~coord.Dim.1, y = ~coord.Dim.2, z = ~coord.Dim.3, color = ~km_cluster, size = 2,
text = ~paste('University:', provider)) %>%
add_markers()
Which cluster is Exeter in?
# pull() extracts just the km_cluster value for Exeter's row
nss %>% filter(provider == "University of Exeter") %>% pull(km_cluster)
## University of Exeter
## 2
## Levels: 1 2 3
# Exeter's score on PCA dimension 1
nss %>% filter(provider == "University of Exeter") %>% pull(coord.Dim.1)
## [1] -0.776422
We can filter parts of the dataset if we want to look in more detail
# Keep only providers far out on the first principal dimension
high_dim <- filter(nss, coord.Dim.1 < -4)
Let’s filter to visualise some of the universities close to Exeter using just PCA dimension 1
# Define the window around University of Exeter's coord.Dim.1 value
exeter_coord <- -0.77
# Renamed from `range`, which masked the base::range() function
dim1_window <- 0.2 # Adjust this window as needed
# Filter universities within the specified window
near_exeter <- nss %>%
  filter(coord.Dim.1 > (exeter_coord - dim1_window),
         coord.Dim.1 < (exeter_coord + dim1_window))
# Plot the filtered universities
ggplot(near_exeter, aes(x = coord.Dim.1, y = coord.Dim.2, label = provider)) +
  geom_point(color = "blue", size = 3) +
  geom_text_repel() +
  theme_minimal() +
  labs(
    x = "Coordinate Dimension 1",
    y = "Coordinate Dimension 2"
  )
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps